In [None]:
import pytz
import datetime
import marimo as mo

# Timestamp for the "Last Updated" line, taken in the Asia/Kolkata time zone.
india_timezone = pytz.timezone("Asia/Kolkata")
now = datetime.datetime.now(tz=india_timezone)

# Format: date, 12-hour clock time with AM/PM, then the time-zone name.
curr = f"{now:%Y-%m-%d, %I:%M:%S %p %Z}"

# Render the assignment header as markdown; `curr` (the timestamp string built
# above) is interpolated into the "Last Updated" line of the raw f-string.
mo.md(
    rf"""
# Week - 5

The last column of the dataset (Credit_Limit) corresponds to the target variable. Separate the dataset to have the feature matrix as X and the target variable as y. Use random_state = 42 wherever applicable.


Split the dataset into train and test set with a **70:30 ratio** and `random_state = 42`. Use these to answer all the questions that follow. 

**Submission Date:** `2025-10-31, 23:59 IST`

**Last Updated:** `{curr}`
"""
)

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import r2_score, root_mean_squared_error
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor

In [None]:
# Load the assignment dataset; the path is relative to the working directory.
_csv_path = "Week-5/Graded Assignment/dataset.csv"
df = pd.read_csv(_csv_path)

In [None]:
# Preview five random rows. The fixed seed keeps the preview identical across
# re-runs, in line with the notebook's "random_state = 42 wherever applicable".
df.sample(5, random_state=42)

In [None]:
# Print column dtypes and non-null counts, then display the summary statistics
# transposed so that each described column becomes a row.
df.info()
df.describe().transpose()

In [None]:
# Report how many distinct values each float-typed column holds.
_float_frame = df.select_dtypes(include="float")
for _name, _series in _float_frame.items():
    _n_distinct = len(_series.unique())
    print(f"{_name}: {_n_distinct}")

In [None]:
# Target vector: the credit limit to be predicted.
y = df["Credit_Limit"]
# Feature matrix: every column except the target.
X = df.drop(columns="Credit_Limit")

In [None]:
# Hold out 30% of the rows for testing; the seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Sanity check: shapes of the four resulting pieces, in the order above.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

## Question 1 - 2

Train a Linear Regression model on the training dataset with fit_intercept = False and compute ‘r2_score’ on the test dataset

In [None]:
# Linear regression configured without an intercept term, as Questions 1-2 require.
lr_model = LinearRegression(
    fit_intercept=False,
)

In [None]:
# Fit the intercept-free linear model on the training split.
lr_model.fit(X=X_train, y=y_train)

### Question 1

What is the value of the r2_score obtained on the test dataset? [Enter 3 decimal places]

In [None]:
# Predict on the held-out split and report R^2 rounded to three decimals.
_y_pred = lr_model.predict(X=X_test)

np.round(r2_score(y_true=y_test, y_pred=_y_pred), decimals=3)

### Question 2

What is the index of the feature with the highest absolute coefficient value?

**Note:** The index starts from 0.

In [None]:
# Question 2 asks for the feature with the highest *absolute* coefficient, so
# rank by magnitude: an argmax over the signed values would miss a coefficient
# that is large but negative.
np.abs(lr_model.coef_).argmax()

## Question 3 - 4

Train a Ridge model on the training data with the following parameters:

- solver = ‘sag’
- tol = 0.0005
- random_state = 42

Compute the ‘**r2_score**’ on the test data

In [None]:
# Ridge regression with the parameters prescribed for Questions 3-4.
rd_model = Ridge(
    solver="sag",
    tol=0.0005,
    random_state=42,
)

In [None]:
# Fit the ridge model on the training split.
rd_model.fit(X=X_train, y=y_train)

### Question 3

What is the value of the **r2_score** obtained on the test dataset? [Enter 3 decimal places]

In [None]:
# Predict with the ridge model on the held-out split and report R^2 (3 decimals).
_y_pred = rd_model.predict(X=X_test)

np.round(r2_score(y_true=y_test, y_pred=_y_pred), decimals=3)

### Question 4

Enter the value of the `intercept_` attribute obtained after training the model. [Enter 3 decimal places]

In [None]:
# Intercept learned by the ridge model, rounded to three decimals.
np.round(rd_model.intercept_, decimals=3)

## Question 5 - 6

Train a Lasso model on the training data with the following parameters:

- alpha = 100
- random_state = 42

Compute the ‘**r2_score**’ on the test data

In [None]:
# Lasso regression with the parameters prescribed for Questions 5-6.
ls_model = Lasso(
    alpha=100,
    random_state=42,
)

In [None]:
# Fit the lasso model on the training split.
ls_model.fit(X=X_train, y=y_train)

### Question 5

What is the value of the **r2_score** obtained on the test dataset? [Enter 3 decimal places]

In [None]:
# Predict with the lasso model on the held-out split and report R^2 (3 decimals).
_y_pred = ls_model.predict(X=X_test)

np.round(r2_score(y_true=y_test, y_pred=_y_pred), decimals=3)

### Question 6

How many coefficients have a value in the range [-1, 1]?

In [None]:
# Count the lasso coefficients that fall inside the closed interval [-1, 1].
_coef = ls_model.coef_
_within_unit = (-1 <= _coef) & (_coef <= 1)

len(_coef[_within_unit])

## Question 7

Train a KNeighborsRegressor with the following parameters:

- n_neighbors = 10
- p = 1

Predict the target for the test data and compute the `root_mean_squared_error` (RMSE) on it.

In [None]:
# k-nearest-neighbours regressor with the parameters prescribed for Question 7.
knr_model = KNeighborsRegressor(
    n_neighbors=10,
    p=1,
)

In [None]:
# Fit the k-nearest-neighbours regressor on the training split.
knr_model.fit(X=X_train, y=y_train)

### Question 7

What is the value of the **rmse** obtained on the test dataset? [Enter 3 decimal places]

In [None]:
# Predict with the KNN regressor on the held-out split and report RMSE (3 decimals).
_y_pred = knr_model.predict(X=X_test)

np.round(root_mean_squared_error(y_true=y_test, y_pred=_y_pred), decimals=3)

## Question 8

Train a Decision Tree Regressor with the following parameters:

- max_depth = 10
- min_samples_split = 6
- min_samples_leaf = 6
- random_state = 42

Predict the target for the test data and compute the `root_mean_squared_error` (RMSE) on it.

In [None]:
# Decision-tree regressor with the parameters prescribed for Question 8.
dtr_model = DecisionTreeRegressor(
    max_depth=10,
    min_samples_split=6,
    min_samples_leaf=6,
    random_state=42,
)

In [None]:
# Fit the decision-tree regressor on the training split.
dtr_model.fit(X=X_train, y=y_train)

### Question 8

What is the value of the **rmse** obtained on the test dataset? [Enter 3 decimal places]

In [None]:
# Predict with the decision tree on the held-out split and report RMSE (3 decimals).
_y_pred = dtr_model.predict(X=X_test)

np.round(root_mean_squared_error(y_true=y_test, y_pred=_y_pred), decimals=3)

## Question 9 - 10

Perform hyperparameter tuning using GridSearchCV on AdaBoostRegressor with `random_state = 42` and `cv = 4`. Tune over the following hyperparameters:

- n_estimators = [10,50,100,200,500]
- learning_rate = [0.1,0.5,1,2]

Use the best model obtained to compute the score on the test data

In [None]:
# Search space for AdaBoost: 5 ensemble sizes x 4 learning rates.
params_grid = dict(
    n_estimators=[10, 50, 100, 200, 500],
    learning_rate=[0.1, 0.5, 1, 2],
)

In [None]:
# Base AdaBoost regressor; the grid search supplies the tuned hyperparameters.
adaboost = AdaBoostRegressor(
    random_state=42,
)

In [None]:
# 4-fold cross-validated grid search over params_grid, ranked by R^2,
# with n_jobs=-1 to run the fits in parallel.
grid_search = GridSearchCV(
    estimator=adaboost,
    param_grid=params_grid,
    scoring="r2",
    cv=4,
    n_jobs=-1,
)

In [None]:
# Run the grid search on the training split.
grid_search.fit(X=X_train, y=y_train)

### Question 9

What is the value of the r2_score obtained on the test dataset? [Enter 3 decimal places]

In [None]:
# Predict on the held-out split through the fitted grid search and report
# R^2 rounded to three decimals.
_y_pred = grid_search.predict(X=X_test)

np.round(r2_score(y_true=y_test, y_pred=_y_pred), decimals=3)

### Question 10

What is the value of `n_estimators` obtained for the best model after training with **GridSearchCV**?

In [None]:
# Display the winning hyperparameter combination from the grid search;
# the answer to Question 10 is its "n_estimators" entry.
grid_search.best_params_